From 6cfe1d4b8bf5aad4c3bc634fe11acdf4586e51c7 Mon Sep 17 00:00:00 2001 From: "kaf24@scramble.cl.cam.ac.uk" Date: Wed, 19 Nov 2003 17:22:42 +0000 Subject: [PATCH] bitkeeper revision 1.612 (3fbba6e271_EVTI6k2Ndd7VThsLqTA) Many files: new file Makefile: New library (libxi) which makes it easier to access the control interfaces. --- .rootkeys | 13 + tools/Makefile | 3 + tools/libxi/Makefile | 48 +++ tools/libxi/libxi_bvtsched.c | 33 ++ tools/libxi/libxi_domain.c | 80 +++++ tools/libxi/libxi_linux_build.c | 481 ++++++++++++++++++++++++++++++ tools/libxi/libxi_linux_restore.c | 476 +++++++++++++++++++++++++++++ tools/libxi/libxi_linux_save.c | 380 +++++++++++++++++++++++ tools/libxi/libxi_misc.c | 50 ++++ tools/libxi/libxi_private.c | 34 +++ tools/libxi/libxi_private.h | 155 ++++++++++ tools/libxi/libxi_vbd.c | 116 +++++++ tools/libxi/libxi_vif.c | 66 ++++ tools/libxi/rpm.spec | 28 ++ tools/libxi/xi.h | 99 ++++++ 15 files changed, 2062 insertions(+) create mode 100644 tools/libxi/Makefile create mode 100644 tools/libxi/libxi_bvtsched.c create mode 100644 tools/libxi/libxi_domain.c create mode 100644 tools/libxi/libxi_linux_build.c create mode 100644 tools/libxi/libxi_linux_restore.c create mode 100644 tools/libxi/libxi_linux_save.c create mode 100644 tools/libxi/libxi_misc.c create mode 100644 tools/libxi/libxi_private.c create mode 100644 tools/libxi/libxi_private.h create mode 100644 tools/libxi/libxi_vbd.c create mode 100644 tools/libxi/libxi_vif.c create mode 100644 tools/libxi/rpm.spec create mode 100644 tools/libxi/xi.h diff --git a/.rootkeys b/.rootkeys index d790137e75..91e5c8213b 100644 --- a/.rootkeys +++ b/.rootkeys @@ -184,6 +184,19 @@ 3fb01fd5B-UeibZkmSCOUZckNyNFYA tools/internal/xi_vbd_list.c 3f86be322bd0h9jG3krZFOUgCDoxZg tools/internal/xi_vif_params.c 3eb781fd7211MZsLxJSiuy7W4KnJXg tools/internal/xi_vifinit +3fbba6dbDfYvJSsw9500b4SZyUhxjQ tools/libxi/Makefile +3fbba6dbEVkVMX0JuDFzap9jeaucGA tools/libxi/libxi_bvtsched.c +3fbba6dbasJQV-MVElDC0DGSHMiL5w 
tools/libxi/libxi_domain.c +3fbba6dbNCU7U6nsMYiXzKkp3ztaJg tools/libxi/libxi_linux_build.c +3fbba6dbl267zZOAVHYLOdLCdhcZMw tools/libxi/libxi_linux_restore.c +3fbba6db7li3FJiABYtCmuGxOJxEGw tools/libxi/libxi_linux_save.c +3fbba6db7WnnJr0KFrIFrqNlSKvFYg tools/libxi/libxi_misc.c +3fbba6dctWRWlFJkYb6hdix2X4WMuw tools/libxi/libxi_private.c +3fbba6dcbVrG2hPzEzwdeV_UC8kydQ tools/libxi/libxi_private.h +3fbba6dcoGq9hQlksrBUfC2P5F6sGg tools/libxi/libxi_vbd.c +3fbba6dc38q-ioRlwSR_quw4G3qUeQ tools/libxi/libxi_vif.c +3fbba6dc1uU7U3IFeF6A-XEOYF2MkQ tools/libxi/rpm.spec +3fbba6dcrNxtygEcgJYAJJ1gCQqfsA tools/libxi/xi.h 3f776bd2Xd-dUcPKlPN2vG89VGtfvQ tools/misc/Makefile 3f6dc136ZKOjd8PIqLbFBl_v-rnkGg tools/misc/miniterm/Makefile 3f6dc140C8tAeBfroAF24VrmCS4v_w tools/misc/miniterm/README diff --git a/tools/Makefile b/tools/Makefile index 590d8ca21b..c384b7f2fe 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -21,17 +21,20 @@ all: $(MAKE) -C balloon $(MAKE) -C control $(MAKE) -C internal + $(MAKE) -C libxi $(MAKE) -C misc install: all $(MAKE) -C balloon install $(MAKE) -C control install $(MAKE) -C internal install + $(MAKE) -C libxi install $(MAKE) -C misc install clean: $(MAKE) -C balloon clean $(MAKE) -C control clean $(MAKE) -C internal clean + $(MAKE) -C libxi clean $(MAKE) -C misc clean diff --git a/tools/libxi/Makefile b/tools/libxi/Makefile new file mode 100644 index 0000000000..b776fa83a2 --- /dev/null +++ b/tools/libxi/Makefile @@ -0,0 +1,48 @@ + +CC = gcc +CFLAGS = -c -Wall -O3 +CFLAGS += -I../../xen/include -I../../xenolinux-sparse/include + +HDRS = $(wildcard *.h) +OBJS = $(patsubst %.c,%.o,$(wildcard libxi_*.c)) + +LIBS = libxi.a libxi.so + +all: check-for-zlib $(LIBS) + ranlib libxi.a + +check-for-zlib: + @if [ ! 
-e /usr/include/zlib.h ]; then \
+	echo "***********************************************************"; \
+	echo "ERROR: install zlib header files (http://www.gzip.org/zlib)"; \
+	echo "***********************************************************"; \
+	false; \
+	fi
+
+# Install the libraries and the public header into the shared install tree.
+# The chmod loop needs "; do" (not "do ;"), must escape the shell variable
+# as $$i so that make does not expand it, and must target install/lib,
+# which is where the libraries were just copied.
+install: all
+	mkdir -p ../../../install/lib
+	mkdir -p ../../../install/include
+	cp -a $(LIBS) ../../../install/lib
+	for i in $(LIBS) ; do chmod 755 ../../../install/lib/$$i ; done
+	cp -a xi.h ../../../install/include
+	chmod 644 ../../../install/include/xi.h
+
+clean:
+	$(RM) *.a *.so *.o *.rpm $(LIBS)
+
+rpm: all
+	rm -rf staging
+	mkdir staging
+	mkdir staging/i386
+	rpmbuild --define "staging$$PWD/staging" --define '_builddir.' \
+	         --define "_rpmdir$$PWD/staging" -bb rpm.spec
+	mv staging/i386/*.rpm .
+	rm -rf staging
+
+libxi.so: $(OBJS)
+	$(LD) -shared -o $@ $^ -lz
+
+libxi.a: libxi.a($(OBJS))
+
+%.o: %.c $(HDRS) Makefile
+	$(CC) $(CFLAGS) -o $@ $<
diff --git a/tools/libxi/libxi_bvtsched.c b/tools/libxi/libxi_bvtsched.c
new file mode 100644
index 0000000000..af6fd17fcb
--- /dev/null
+++ b/tools/libxi/libxi_bvtsched.c
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * libxi_bvtsched.c
+ *
+ * API for manipulating parameters of the Borrowed Virtual Time scheduler.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */ + +#include "libxi_private.h" + +int xi_bvtsched_global_set(unsigned long ctx_allow) +{ + dom0_op_t op; + op.cmd = DOM0_BVTCTL; + op.u.bvtctl.ctx_allow = ctx_allow; + return do_dom0_op(&op); +} + +int xi_bvtsched_domain_set(unsigned int domid, + unsigned long mcuadv, + unsigned long warp, + unsigned long warpl, + unsigned long warpu) +{ + dom0_op_t op; + op.cmd = DOM0_ADJUSTDOM; + op.u.adjustdom.domain = domid; + op.u.adjustdom.mcu_adv = mcuadv; + op.u.adjustdom.warp = warp; + op.u.adjustdom.warpl = warpl; + op.u.adjustdom.warpu = warpu; + return do_dom0_op(&op); +} diff --git a/tools/libxi/libxi_domain.c b/tools/libxi/libxi_domain.c new file mode 100644 index 0000000000..ad75ef5701 --- /dev/null +++ b/tools/libxi/libxi_domain.c @@ -0,0 +1,80 @@ +/****************************************************************************** + * libxi_domain.c + * + * API for manipulating and obtaining information on domains. + * + * Copyright (c) 2003, K A Fraser. + */ + +#include "libxi_private.h" + +int xi_domain_create(unsigned int mem_kb, const char *name) +{ + int err; + dom0_op_t op; + + op.cmd = DOM0_CREATEDOMAIN; + op.u.createdomain.memory_kb = mem_kb; + strncpy(op.u.createdomain.name, name, MAX_DOMAIN_NAME); + op.u.createdomain.name[MAX_DOMAIN_NAME-1] = '\0'; + + err = do_dom0_op(&op); + + return (err < 0) ? 
err : op.u.createdomain.domain;
+}
+
+
+int xi_domain_start(unsigned int domid)
+{
+    dom0_op_t op;
+    op.cmd = DOM0_STARTDOMAIN;
+    op.u.startdomain.domain = domid;
+    return do_dom0_op(&op);
+}
+
+
+int xi_domain_stop(unsigned int domid)
+{
+    dom0_op_t op;
+    op.cmd = DOM0_STOPDOMAIN;
+    op.u.stopdomain.domain = domid;
+    return do_dom0_op(&op);
+}
+
+
+int xi_domain_destroy(unsigned int domid, int force)
+{
+    dom0_op_t op;
+    op.cmd = DOM0_DESTROYDOMAIN;
+    op.u.destroydomain.domain = domid;
+    op.u.destroydomain.force  = !!force;
+    return do_dom0_op(&op);
+}
+
+/*
+ * Fill info[0..max_doms) with one record per domain, starting the search at
+ * first_domid and continuing from the id after each domain that was reported.
+ * Stops early at the first failing DOM0_GETDOMAININFO request.
+ * Returns the number of records written.
+ */
+int xi_domain_getinfo(unsigned int first_domid,
+                      unsigned int max_doms,
+                      xi_dominfo_t *info)
+{
+    unsigned int nr_doms, next_domid = first_domid;
+    dom0_op_t op;
+
+    /*
+     * 'info' advances together with 'nr_doms'; without this every domain
+     * would overwrite the caller's first record.
+     */
+    for ( nr_doms = 0; nr_doms < max_doms; nr_doms++, info++ )
+    {
+        op.cmd = DOM0_GETDOMAININFO;
+        op.u.getdomaininfo.domain = next_domid;
+        if ( do_dom0_op(&op) < 0 )
+            break;
+        info->domid    = op.u.getdomaininfo.domain;
+        info->cpu      = op.u.getdomaininfo.processor;
+        info->has_cpu  = op.u.getdomaininfo.has_cpu;
+        info->stopped  = (op.u.getdomaininfo.state == DOMSTATE_STOPPED);
+        info->nr_pages = op.u.getdomaininfo.tot_pages;
+        info->cpu_time = op.u.getdomaininfo.cpu_time;
+        strncpy(info->name, op.u.getdomaininfo.name, XI_DOMINFO_MAXNAME);
+        info->name[XI_DOMINFO_MAXNAME-1] = '\0';
+
+        next_domid = op.u.getdomaininfo.domain + 1;
+    }
+
+    return nr_doms;
+}
diff --git a/tools/libxi/libxi_linux_build.c b/tools/libxi/libxi_linux_build.c
new file mode 100644
index 0000000000..ff9bd354eb
--- /dev/null
+++ b/tools/libxi/libxi_linux_build.c
@@ -0,0 +1,481 @@
+/******************************************************************************
+ * libxi_linux_build.c
+ */
+
+#include "libxi_private.h"
+#include <zlib.h>
+
+/* This string is written to the head of every guest kernel image.
*/
+#define GUEST_SIG   "XenoGues"
+#define SIG_LEN    8
+
+#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
+#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
+
+static long get_tot_pages(int domid)
+{
+    dom0_op_t op;
+    op.cmd = DOM0_GETDOMAININFO;
+    op.u.getdomaininfo.domain = domid;
+    return (do_dom0_op(&op) < 0) ? -1 : op.u.getdomaininfo.tot_pages;
+}
+
+static int get_pfn_list(
+    int domid, unsigned long *pfn_buf, unsigned long max_pfns)
+{
+    dom0_op_t op;
+    int ret;
+    op.cmd = DOM0_GETMEMLIST;
+    op.u.getmemlist.domain   = domid;
+    op.u.getmemlist.max_pfns = max_pfns;
+    op.u.getmemlist.buffer   = pfn_buf;
+
+    if ( mlock(pfn_buf, max_pfns * sizeof(unsigned long)) != 0 )
+        return -1;
+
+    ret = do_dom0_op(&op);
+
+    (void)munlock(pfn_buf, max_pfns * sizeof(unsigned long));
+
+    return (ret < 0) ? -1 : op.u.getmemlist.num_pfns;
+}
+
+static int send_pgupdates(mmu_update_t *updates, int nr_updates)
+{
+    int ret = -1;
+    privcmd_hypercall_t hypercall;
+
+    hypercall.op     = __HYPERVISOR_mmu_update;
+    hypercall.arg[0] = (unsigned long)updates;
+    hypercall.arg[1] = (unsigned long)nr_updates;
+
+    if ( mlock(updates, nr_updates * sizeof(*updates)) != 0 )
+        goto out1;
+
+    if ( do_xen_hypercall(&hypercall) < 0 )
+        goto out2;
+
+    ret = 0;
+
+ out2: (void)munlock(updates, nr_updates * sizeof(*updates));
+ out1: return ret;
+}
+
+/*
+ * Read the kernel header: verify the guest signature and extract the load
+ * address into *load_addr. Returns 0 on success, -1 if the signature is
+ * missing/wrong or the header is truncated. 'dom_size' is currently unused.
+ */
+static int read_kernel_header(gzFile gfd, long dom_size,
+                              unsigned long *load_addr, int verbose)
+{
+    char signature[SIG_LEN];
+
+    /* A short read must fail here rather than compare uninitialised bytes. */
+    if ( (gzread(gfd, signature, SIG_LEN) != SIG_LEN) ||
+         strncmp(signature, GUEST_SIG, SIG_LEN) )
+    {
+        if ( verbose )
+            ERROR("Kernel image does not contain required signature");
+        return -1;
+    }
+
+    /* Read the load address which immediately follows the Xeno signature. */
+    if ( gzread(gfd, load_addr, sizeof(unsigned long)) !=
+         (int)sizeof(unsigned long) )
+    {
+        if ( verbose )
+            ERROR("Kernel image is truncated: could not read load address");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int copy_to_domain_page(unsigned long dst_pfn, void *src_page)
+{
+    void *vaddr = map_pfn(dst_pfn);
+    if ( vaddr == NULL )
+        return -1;
+    memcpy(vaddr, src_page, PAGE_SIZE);
+    unmap_pfn(vaddr);
+    return 0;
+}
+
+static int setup_guestos(
+    int dom, gzFile kernel_gfd, int initrd_fd, unsigned long tot_pages,
+    unsigned long *virt_startinfo_addr, unsigned long virt_load_addr,
+    dom0_builddomain_t *builddomain, const char *cmdline,
+    unsigned long shared_info_frame, int verbose)
+{
+    l1_pgentry_t *vl1tab = NULL, *vl1e = NULL;
+    l2_pgentry_t *vl2tab = NULL, *vl2e = NULL;
+    unsigned long *page_array = NULL;
+    mmu_update_t *pgt_update_arr = NULL, *pgt_updates = NULL;
+    int alloc_index, num_pt_pages;
+    unsigned long l2tab;
+    unsigned long l1tab = 0;
+    unsigned long num_pgt_updates = 0;
+    unsigned long count, pt_start, i, j;
+    unsigned long initrd_addr = 0, initrd_len = 0;
+    start_info_t *start_info;
+    shared_info_t *shared_info;
+    unsigned long ksize;
+
+    memset(builddomain, 0, sizeof(*builddomain));
+
+    if ( init_pfn_mapper() < 0 )
+        goto error_out;
+
+    pgt_updates = malloc((tot_pages + 1024) * 3 * sizeof(mmu_update_t));
+    page_array = malloc(tot_pages * sizeof(unsigned long));
+    pgt_update_arr = pgt_updates;
+    if ( (pgt_update_arr == NULL) || (page_array == NULL) )
+    {
+        if ( verbose )
+            PERROR("Could not allocate memory");
+        goto error_out;
+    }
+
+    if ( get_pfn_list(dom, page_array, tot_pages) != tot_pages )
+    {
+        if ( verbose )
+            PERROR("Could not get the page frame list");
+        goto error_out;
+    }
+
+    /* Load the guest OS image.
Let it take no more than 1/2 memory.*/ + for ( i = 0; i < ((tot_pages/2)*PAGE_SIZE); i += PAGE_SIZE ) + { + char page[PAGE_SIZE]; + int size; + if ( (size = gzread(kernel_gfd, page, PAGE_SIZE)) == -1 ) + { + if ( verbose ) + PERROR("Error reading kernel image, could not" + " read the whole image."); + goto error_out; + } + if ( size == 0 ) + goto kernel_copied; + copy_to_domain_page(page_array[i>>PAGE_SHIFT], page); + } + if ( verbose ) + ERROR("Kernel too big to safely fit in domain memory"); + goto error_out; + + kernel_copied: + /* ksize is kernel-image size rounded up to a page boundary. */ + ksize = i; + + /* Load the initial ramdisk image. */ + if ( initrd_fd >= 0 ) + { + struct stat stat; + unsigned long isize; + + if ( fstat(initrd_fd, &stat) < 0 ) + { + if ( verbose ) + PERROR("Could not stat the initrd image"); + goto error_out; + } + isize = stat.st_size; + if ( (isize + ksize) > ((tot_pages/2) * PAGE_SIZE) ) + { + if ( verbose ) + ERROR("Kernel/initrd too big to safely fit in domain memory"); + goto error_out; + } + + initrd_addr = virt_load_addr + ksize; + initrd_len = isize; + + for ( j = 0, i = ksize; j < isize; j += PAGE_SIZE, i += PAGE_SIZE ) + { + char page[PAGE_SIZE]; + int size = ((isize-j) < PAGE_SIZE) ? (isize-j) : PAGE_SIZE; + if ( read(initrd_fd, page, size) != size ) + { + if ( verbose ) + PERROR("Error reading initrd image, could not" + " read the whole image."); + goto error_out; + } + copy_to_domain_page(page_array[i>>PAGE_SHIFT], page); + } + } + + alloc_index = tot_pages - 1; + + /* Count bottom-level PTs, rounding up. */ + num_pt_pages = (l1_table_offset(virt_load_addr) + tot_pages + 1023) / 1024; + + /* We must also count the page directory. */ + num_pt_pages++; + + /* Index of first PT page. */ + pt_start = tot_pages - num_pt_pages; + + /* + * First allocate page for page dir. Allocation goes backwards from the end + * of the allocated physical address space. 
+ */ + l2tab = page_array[alloc_index] << PAGE_SHIFT; + alloc_index--; + builddomain->ctxt.pt_base = l2tab; + + /* + * Pin down l2tab addr as page dir page - causes hypervisor to provide + * correct protection for the page + */ + pgt_updates->ptr = l2tab | MMU_EXTENDED_COMMAND; + pgt_updates->val = MMUEXT_PIN_L2_TABLE; + pgt_updates++; + num_pgt_updates++; + + /* Initialise the page tables. */ + if ( (vl2tab = map_pfn(l2tab >> PAGE_SHIFT)) == NULL ) + goto error_out; + memset(vl2tab, 0, PAGE_SIZE); + vl2e = vl2tab + l2_table_offset(virt_load_addr); + for ( count = 0; count < tot_pages; count++ ) + { + if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 ) + { + l1tab = page_array[alloc_index] << PAGE_SHIFT; + if ( (vl1tab = map_pfn(l1tab >> PAGE_SHIFT)) == NULL ) + goto error_out; + memset(vl1tab, 0, PAGE_SIZE); + alloc_index--; + + vl1e = vl1tab + l1_table_offset(virt_load_addr + + (count << PAGE_SHIFT)); + + /* make apropriate entry in the page directory */ + pgt_updates->ptr = (unsigned long)vl2e; + pgt_updates->val = l1tab | L2_PROT; + pgt_updates++; + num_pgt_updates++; + vl2e++; + } + + if ( count < pt_start ) + { + pgt_updates->ptr = (unsigned long)vl1e; + pgt_updates->val = (page_array[count] << PAGE_SHIFT) | L1_PROT; + pgt_updates++; + num_pgt_updates++; + vl1e++; + } + else + { + pgt_updates->ptr = (unsigned long)vl1e; + pgt_updates->val = + ((page_array[count] << PAGE_SHIFT) | L1_PROT) & ~_PAGE_RW; + pgt_updates++; + num_pgt_updates++; + vl1e++; + } + + pgt_updates->ptr = + (page_array[count] << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + pgt_updates->val = count; + pgt_updates++; + num_pgt_updates++; + } + + *virt_startinfo_addr = + virt_load_addr + ((alloc_index-1) << PAGE_SHIFT); + + start_info = map_pfn(page_array[alloc_index-1]); + memset(start_info, 0, sizeof(*start_info)); + start_info->pt_base = virt_load_addr + ((tot_pages-1) << PAGE_SHIFT); + start_info->mod_start = initrd_addr; + start_info->mod_len = initrd_len; + start_info->nr_pages = tot_pages; + 
start_info->shared_info = shared_info_frame << PAGE_SHIFT;
+    start_info->dom_id      = dom;
+    start_info->flags       = 0;
+    strncpy(start_info->cmd_line, cmdline, MAX_CMD_LEN);
+    start_info->cmd_line[MAX_CMD_LEN-1] = '\0';
+
+    unmap_pfn(start_info);
+
+    /* shared_info page starts its life empty. */
+    shared_info = map_pfn(shared_info_frame);
+    memset(shared_info, 0, PAGE_SIZE);
+    unmap_pfn(shared_info);
+
+    /* Send the page update requests down to the hypervisor. */
+    if ( send_pgupdates(pgt_update_arr, num_pgt_updates) < 0 )
+        goto error_out;
+
+    free(page_array);
+    free(pgt_update_arr);
+    return 0;
+
+ error_out:
+    /*
+     * free(NULL) is a no-op, so release both buffers unconditionally.
+     * Testing "== NULL" before freeing would leak whichever was allocated.
+     */
+    free(page_array);
+    free(pgt_update_arr);
+    return -1;
+}
+
+int xi_domain_build(unsigned int domid,
+                    const char *image_name,
+                    const char *ramdisk_name,
+                    const char *cmdline,
+                    int verbose)
+{
+    dom0_op_t launch_op, op;
+    unsigned long load_addr;
+    long tot_pages;
+    int kernel_fd, initrd_fd = -1;
+    gzFile kernel_gfd;
+    int rc, i;
+    full_execution_context_t *ctxt;
+    unsigned long virt_startinfo_addr;
+
+    if ( (tot_pages = get_tot_pages(domid)) < 0 )
+    {
+        if ( verbose )
+            PERROR("Could not find total pages for domain");
+        return 1;
+    }
+
+    kernel_fd = open(image_name, O_RDONLY);
+    if ( kernel_fd < 0 )
+    {
+        if ( verbose )
+            PERROR("Could not open kernel image");
+        return 1;
+    }
+
+    if ( (kernel_gfd = gzdopen(kernel_fd, "rb")) == NULL )
+    {
+        if ( verbose )
+            PERROR("Could not allocate decompression state for state file");
+        close(kernel_fd);
+        return 1;
+    }
+
+    rc = read_kernel_header(kernel_gfd,
+                            tot_pages << (PAGE_SHIFT - 10),
+                            &load_addr, verbose);
+    if ( rc < 0 )
+        goto error_out;
+
+    if ( (load_addr & (PAGE_SIZE-1)) != 0 )
+    {
+        if ( verbose )
+            ERROR("We can only deal with page-aligned load addresses");
+        goto error_out;
+    }
+
+    if ( (load_addr + (tot_pages << PAGE_SHIFT)) > HYPERVISOR_VIRT_START )
+    {
+        if ( verbose )
+            ERROR("Cannot map all domain memory without hitting Xen space");
+        goto
error_out;
+    }
+
+    if ( ramdisk_name != NULL )
+    {
+        initrd_fd = open(ramdisk_name, O_RDONLY);
+        if ( initrd_fd < 0 )
+        {
+            if ( verbose )
+                PERROR("Could not open the initial ramdisk image");
+            goto error_out;
+        }
+    }
+
+    op.cmd = DOM0_GETDOMAININFO;
+    op.u.getdomaininfo.domain = domid;
+    if ( (do_dom0_op(&op) < 0) || (op.u.getdomaininfo.domain != domid) )
+    {
+        if ( verbose )
+            PERROR("Could not get info on domain");
+        goto error_out;
+    }
+    if ( (op.u.getdomaininfo.state != DOMSTATE_STOPPED) ||
+         (op.u.getdomaininfo.ctxt.pt_base != 0) )
+    {
+        if ( verbose )
+            ERROR("Domain is already constructed");
+        goto error_out;
+    }
+
+    if ( setup_guestos(domid, kernel_gfd, initrd_fd, tot_pages,
+                       &virt_startinfo_addr,
+                       load_addr, &launch_op.u.builddomain, cmdline,
+                       op.u.getdomaininfo.shared_info_frame, verbose) < 0 )
+    {
+        if ( verbose )
+            ERROR("Error constructing guest OS");
+        goto error_out;
+    }
+
+    if ( initrd_fd >= 0 )
+        close(initrd_fd);
+    gzclose(kernel_gfd);
+
+    ctxt = &launch_op.u.builddomain.ctxt;
+
+    ctxt->flags = 0;
+
+    /*
+     * Initial register values:
+     *  DS,ES,FS,GS = FLAT_RING1_DS
+     *       CS:EIP = FLAT_RING1_CS:start_pc
+     *       SS:ESP = FLAT_RING1_DS:start_stack
+     *          ESI = start_info
+     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
+     *       EFLAGS = IF | 2 (bit 1 is reserved and should always be 1)
+     */
+    ctxt->i386_ctxt.ds = FLAT_RING1_DS;
+    ctxt->i386_ctxt.es = FLAT_RING1_DS;
+    ctxt->i386_ctxt.fs = FLAT_RING1_DS;
+    ctxt->i386_ctxt.gs = FLAT_RING1_DS;
+    ctxt->i386_ctxt.ss = FLAT_RING1_DS;
+    ctxt->i386_ctxt.cs = FLAT_RING1_CS;
+    ctxt->i386_ctxt.eip = load_addr;
+    ctxt->i386_ctxt.esp = virt_startinfo_addr;
+    ctxt->i386_ctxt.esi = virt_startinfo_addr;
+    /* IF is bit 9; the always-one reserved flag is bit 1 (value 2). */
+    ctxt->i386_ctxt.eflags = (1<<9) | (1<<1);
+
+    /* FPU is set up to default initial state. */
+    memset(ctxt->i387_ctxt, 0, sizeof(ctxt->i387_ctxt));
+
+    /* Virtual IDT is empty at start-of-day.
*/ + for ( i = 0; i < 256; i++ ) + { + ctxt->trap_ctxt[i].vector = i; + ctxt->trap_ctxt[i].cs = FLAT_RING1_CS; + } + ctxt->fast_trap_idx = 0; + + /* No LDT. */ + ctxt->ldt_ents = 0; + + /* Use the default Xen-provided GDT. */ + ctxt->gdt_ents = 0; + + /* Ring 1 stack is the initial stack. */ + ctxt->ring1_ss = FLAT_RING1_DS; + ctxt->ring1_esp = virt_startinfo_addr; + + /* No debugging. */ + memset(ctxt->debugreg, 0, sizeof(ctxt->debugreg)); + + /* No callback handlers. */ + ctxt->event_callback_cs = FLAT_RING1_CS; + ctxt->event_callback_eip = 0; + ctxt->failsafe_callback_cs = FLAT_RING1_CS; + ctxt->failsafe_callback_eip = 0; + + launch_op.u.builddomain.domain = domid; + launch_op.u.builddomain.num_vifs = 1; + + launch_op.cmd = DOM0_BUILDDOMAIN; + rc = do_dom0_op(&launch_op); + + return rc; + + error_out: + if ( initrd_fd >= 0 ) + close(initrd_fd); + gzclose(kernel_gfd); + return -1; +} diff --git a/tools/libxi/libxi_linux_restore.c b/tools/libxi/libxi_linux_restore.c new file mode 100644 index 0000000000..25d927a95d --- /dev/null +++ b/tools/libxi/libxi_linux_restore.c @@ -0,0 +1,476 @@ +/****************************************************************************** + * libxi_linux_restore.c + * + * Restore the state of a Xenolinux session. + * + * Copyright (c) 2003, K A Fraser. + */ + +#include "libxi_private.h" +#include +#include + +/* This may allow us to create a 'quiet' command-line option, if necessary. */ +#define verbose_printf(_f, _a...) 
\
+    do {                                          \
+        if ( !verbose ) break;                    \
+        printf( _f , ## _a );                     \
+        fflush(stdout);                           \
+    } while ( 0 )
+
+static int get_pfn_list(
+    int domain_id, unsigned long *pfn_buf, unsigned long max_pfns)
+{
+    dom0_op_t op;
+    int ret;
+    op.cmd = DOM0_GETMEMLIST;
+    op.u.getmemlist.domain   = domain_id;
+    op.u.getmemlist.max_pfns = max_pfns;
+    op.u.getmemlist.buffer   = pfn_buf;
+
+    if ( mlock(pfn_buf, max_pfns * sizeof(unsigned long)) != 0 )
+    {
+        PERROR("Could not lock pfn list buffer");
+        return -1;
+    }
+
+    ret = do_dom0_op(&op);
+
+    (void)munlock(pfn_buf, max_pfns * sizeof(unsigned long));
+
+    return (ret < 0) ? -1 : op.u.getmemlist.num_pfns;
+}
+
+#define MAX_MMU_UPDATES 1024
+
+/*
+ * Submit the first *mmu_update_idx entries of mmu_updates[] through the
+ * mmu_update hypercall. Returns 0 on success (or when nothing is queued)
+ * and 1 on failure. The index is reset to 0 once the hypercall has been
+ * attempted; it is left untouched if the buffer could not be locked.
+ */
+static int flush_mmu_updates(mmu_update_t *mmu_updates,
+                             int *mmu_update_idx)
+{
+    int err = 0;
+    privcmd_hypercall_t hypercall;
+    size_t nr_bytes;
+
+    if ( *mmu_update_idx == 0 )
+        return 0;
+
+    /*
+     * Lock the queued entries themselves. sizeof(mmu_updates) would only be
+     * the size of the pointer parameter, not of the array it points to.
+     */
+    nr_bytes = *mmu_update_idx * sizeof(*mmu_updates);
+
+    hypercall.op     = __HYPERVISOR_mmu_update;
+    hypercall.arg[0] = (unsigned long)mmu_updates;
+    hypercall.arg[1] = (unsigned long)*mmu_update_idx;
+
+    if ( mlock(mmu_updates, nr_bytes) != 0 )
+    {
+        PERROR("Could not lock pagetable update array");
+        err = 1;
+        goto out;
+    }
+
+    if ( do_xen_hypercall(&hypercall) < 0 )
+    {
+        ERROR("Failure when submitting mmu updates");
+        err = 1;
+    }
+
+    *mmu_update_idx = 0;
+
+    (void)munlock(mmu_updates, nr_bytes);
+
+ out:
+    return err;
+}
+
+static int add_mmu_update(mmu_update_t *mmu_updates,
+                          int *mmu_update_idx,
+                          unsigned long ptr,
+                          unsigned long val)
+{
+    mmu_updates[*mmu_update_idx].ptr = ptr;
+    mmu_updates[*mmu_update_idx].val = val;
+    if ( ++*mmu_update_idx == MAX_MMU_UPDATES )
+        return flush_mmu_updates(mmu_updates, mmu_update_idx);
+    return 0;
+}
+
+static int checked_read(gzFile fd, void *buf, size_t count)
+{
+    int rc;
+    while ( ((rc = gzread(fd, buf, count)) == -1) && (errno == EINTR) )
+        continue;
+    return rc == count;
+}
+
+int xi_linux_restore(const char *state_file, int verbose)
+{
+    dom0_op_t op;
+    int rc = 1, i, j;
+    unsigned long mfn, pfn, dom = 0;
+
unsigned int prev_pc, this_pc; + + /* Number of page frames in use by this XenoLinux session. */ + unsigned long nr_pfns; + + /* The new domain's shared-info frame number. */ + unsigned long shared_info_frame; + unsigned char shared_info[PAGE_SIZE]; /* saved contents from file */ + + /* A copy of the CPU context of the guest. */ + full_execution_context_t ctxt; + + /* First 16 bytes of the state file must contain 'XenoLinuxSuspend'. */ + char signature[16]; + + /* A copy of the domain's name. */ + char name[MAX_DOMAIN_NAME]; + + /* A table containg the type of each PFN (/not/ MFN!). */ + unsigned long *pfn_type = NULL; + + /* A temporary mapping, and a copy, of one frame of guest memory. */ + unsigned long *ppage, page[1024]; + + /* A copy of the pfn-to-mfn table frame list. */ + unsigned long pfn_to_mfn_frame_list[1024]; + + /* A table mapping each PFN to its new MFN. */ + unsigned long *pfn_to_mfn_table = NULL; + + /* A temporary mapping of the guest's suspend record. */ + suspend_record_t *p_srec; + + /* The name and descriptor of the file that we are reading from. */ + int fd; + gzFile gfd; + + mmu_update_t mmu_updates[MAX_MMU_UPDATES]; + int mmu_update_idx = 0; + + if ( (fd = open(state_file, O_RDONLY)) == -1 ) + { + PERROR("Could not open state file for reading"); + return 1; + } + + if ( (gfd = gzdopen(fd, "rb")) == NULL ) + { + ERROR("Could not allocate decompression state for state file"); + close(fd); + return 1; + } + + /* Start writing out the saved-domain record. 
*/ + if ( !checked_read(gfd, signature, 16) || + (memcmp(signature, "XenoLinuxSuspend", 16) != 0) ) + { + ERROR("Unrecognised state format -- no signature found"); + goto out; + } + + if ( !checked_read(gfd, name, sizeof(name)) || + !checked_read(gfd, &nr_pfns, sizeof(unsigned long)) || + !checked_read(gfd, &ctxt, sizeof(ctxt)) || + !checked_read(gfd, shared_info, PAGE_SIZE) || + !checked_read(gfd, pfn_to_mfn_frame_list, PAGE_SIZE) ) + { + ERROR("Error when reading from state file"); + goto out; + } + + for ( i = 0; i < MAX_DOMAIN_NAME; i++ ) + { + if ( name[i] == '\0' ) break; + if ( name[i] & 0x80 ) + { + ERROR("Random characters in domain name"); + goto out; + } + } + name[MAX_DOMAIN_NAME-1] = '\0'; + + if ( nr_pfns > 1024*1024 ) + { + ERROR("Invalid state file -- pfn count out of range"); + goto out; + } + + /* We want zeroed memory so use calloc rather than malloc. */ + pfn_to_mfn_table = calloc(1, 4 * nr_pfns); + pfn_type = calloc(1, 4 * nr_pfns); + + if ( (pfn_to_mfn_table == NULL) || (pfn_type == NULL) ) + { + errno = ENOMEM; + goto out; + } + + if ( !checked_read(gfd, pfn_type, 4 * nr_pfns) ) + { + ERROR("Error when reading from state file"); + goto out; + } + + /* Create a new domain of the appropriate size, and find it's dom_id. */ + op.cmd = DOM0_CREATEDOMAIN; + op.u.createdomain.memory_kb = nr_pfns * (PAGE_SIZE / 1024); + memcpy(op.u.createdomain.name, name, MAX_DOMAIN_NAME); + if ( do_dom0_op(&op) < 0 ) + { + ERROR("Could not create new domain"); + goto out; + } + dom = op.u.createdomain.domain; + + /* Get the domain's shared-info frame. */ + op.cmd = DOM0_GETDOMAININFO; + op.u.getdomaininfo.domain = dom; + if ( do_dom0_op(&op) < 0 ) + { + ERROR("Could not get information on new domain"); + goto out; + } + shared_info_frame = op.u.getdomaininfo.shared_info_frame; + + if ( init_pfn_mapper() < 0 ) + goto out; + + /* Copy saved contents of shared-info page. No checking needed. 
*/ + ppage = map_pfn(shared_info_frame); + memcpy(ppage, shared_info, PAGE_SIZE); + unmap_pfn(ppage); + + /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */ + if ( get_pfn_list(dom, pfn_to_mfn_table, nr_pfns) != nr_pfns ) + { + ERROR("Did not read correct number of frame numbers for new dom"); + goto out; + } + + verbose_printf("Reloading memory pages: 0%%"); + + /* + * Now simply read each saved frame into its new machine frame. + * We uncanonicalise page tables as we go. + */ + prev_pc = 0; + for ( i = 0; i < nr_pfns; i++ ) + { + this_pc = (i * 100) / nr_pfns; + if ( (this_pc - prev_pc) >= 5 ) + { + verbose_printf("\b\b\b\b%3d%%", this_pc); + prev_pc = this_pc; + } + + mfn = pfn_to_mfn_table[i]; + + if ( !checked_read(gfd, page, PAGE_SIZE) ) + { + ERROR("Error when reading from state file"); + goto out; + } + + ppage = map_pfn(mfn); + switch ( pfn_type[i] ) + { + case L1TAB: + memset(ppage, 0, PAGE_SIZE); + if ( add_mmu_update(mmu_updates, &mmu_update_idx, + (mfn<> PAGE_SHIFT) >= nr_pfns ) + { + ERROR("Frame number in page table is out of range"); + goto out; + } + if ( (pfn_type[pfn] != NONE) && (page[j] & _PAGE_RW) ) + { + ERROR("Write access requested for a restricted frame"); + goto out; + } + page[j] &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PAT); + page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT; + } + if ( add_mmu_update(mmu_updates, &mmu_update_idx, + (unsigned long)&ppage[j], page[j]) ) + goto out; + } + break; + case L2TAB: + memset(ppage, 0, PAGE_SIZE); + if ( add_mmu_update(mmu_updates, &mmu_update_idx, + (mfn<>L2_PAGETABLE_SHIFT); j++ ) + { + if ( page[j] & _PAGE_PRESENT ) + { + if ( (pfn = page[j] >> PAGE_SHIFT) >= nr_pfns ) + { + ERROR("Frame number in page table is out of range"); + goto out; + } + if ( pfn_type[pfn] != L1TAB ) + { + ERROR("Page table mistyping"); + goto out; + } + /* Haven't reached the L1 table yet. Ensure it is safe! 
*/ + if ( pfn > i ) + { + unsigned long **l1 = map_pfn(pfn_to_mfn_table[pfn]); + memset(l1, 0, PAGE_SIZE); + unmap_pfn(l1); + } + page[j] &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE); + page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT; + } + if ( add_mmu_update(mmu_updates, &mmu_update_idx, + (unsigned long)&ppage[j], page[j]) ) + goto out; + } + break; + default: + memcpy(ppage, page, PAGE_SIZE); + break; + } + /* NB. Must flush before unmapping page, as pass VAs to Xen. */ + if ( flush_mmu_updates(mmu_updates, &mmu_update_idx) ) + goto out; + unmap_pfn(ppage); + + if ( add_mmu_update(mmu_updates, &mmu_update_idx, + (mfn<= nr_pfns) || (pfn_type[pfn] != NONE) ) + { + ERROR("Suspend record frame number is bad"); + goto out; + } + ctxt.i386_ctxt.esi = mfn = pfn_to_mfn_table[pfn]; + p_srec = map_pfn(mfn); + p_srec->resume_info.nr_pages = nr_pfns; + p_srec->resume_info.shared_info = shared_info_frame << PAGE_SHIFT; + p_srec->resume_info.dom_id = dom; + p_srec->resume_info.flags = 0; + unmap_pfn(p_srec); + + /* Uncanonicalise each GDT frame number. */ + if ( ctxt.gdt_ents > 8192 ) + { + ERROR("GDT entry count out of range"); + goto out; + } + for ( i = 0; i < ctxt.gdt_ents; i += 512 ) + { + pfn = ctxt.gdt_frames[i]; + if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NONE) ) + { + ERROR("GDT frame number is bad"); + goto out; + } + ctxt.gdt_frames[i] = pfn_to_mfn_table[pfn]; + } + + /* Uncanonicalise the page table base pointer. */ + pfn = ctxt.pt_base >> PAGE_SHIFT; + if ( (pfn >= nr_pfns) || (pfn_type[pfn] != L2TAB) ) + { + ERROR("PT base is bad"); + goto out; + } + ctxt.pt_base = pfn_to_mfn_table[pfn] << PAGE_SHIFT; + + /* Uncanonicalise the pfn-to-mfn table frame-number list. 
*/ + for ( i = 0; i < nr_pfns; i += 1024 ) + { + unsigned long copy_size = (nr_pfns - i) * sizeof(unsigned long); + if ( copy_size > PAGE_SIZE ) copy_size = PAGE_SIZE; + pfn = pfn_to_mfn_frame_list[i/1024]; + if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NONE) ) + { + ERROR("PFN-to-MFN frame number is bad"); + goto out; + } + ppage = map_pfn(pfn_to_mfn_table[pfn]); + memcpy(ppage, &pfn_to_mfn_table[i], copy_size); + unmap_pfn(ppage); + } + + /* + * Safety checking of saved context: + * 1. i386_ctxt is fine, as Xen checks that on context switch. + * 2. i387_ctxt is fine, as it can't hurt Xen. + * 3. trap_ctxt needs the code selectors checked. + * 4. fast_trap_idx is checked by Xen. + * 5. ldt base must be page-aligned, no more than 8192 ents, ... + * 6. gdt already done, and further checking is done by Xen. + * 7. check that ring1_ss is safe. + * 8. pt_base is already done. + * 9. debugregs are checked by Xen. + * 10. callback code selectors need checking. + */ + for ( i = 0; i < 256; i++ ) + { + ctxt.trap_ctxt[i].vector = i; + if ( (ctxt.trap_ctxt[i].cs & 3) == 0 ) + ctxt.trap_ctxt[i].cs = FLAT_RING1_CS; + } + if ( (ctxt.ring1_ss & 3) == 0 ) + ctxt.ring1_ss = FLAT_RING1_DS; + if ( (ctxt.event_callback_cs & 3) == 0 ) + ctxt.event_callback_cs = FLAT_RING1_CS; + if ( (ctxt.failsafe_callback_cs & 3) == 0 ) + ctxt.failsafe_callback_cs = FLAT_RING1_CS; + if ( ((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) || + (ctxt.ldt_ents > 8192) || + (ctxt.ldt_base > HYPERVISOR_VIRT_START) || + ((ctxt.ldt_base + ctxt.ldt_ents*8) > HYPERVISOR_VIRT_START) ) + { + ERROR("Bad LDT base or size"); + goto out; + } + + op.cmd = DOM0_BUILDDOMAIN; + op.u.builddomain.domain = dom; + op.u.builddomain.num_vifs = 1; + memcpy(&op.u.builddomain.ctxt, &ctxt, sizeof(ctxt)); + rc = do_dom0_op(&op); + + out: + if ( rc != 0 ) + { + if ( dom != 0 ) + { + op.cmd = DOM0_DESTROYDOMAIN; + op.u.destroydomain.domain = dom; + op.u.destroydomain.force = 1; + (void)do_dom0_op(&op); + } + } + else + { + /* Success: print 
the domain id. */ + verbose_printf("DOM=%ld\n", dom); + } + + if ( pfn_to_mfn_table != NULL ) + free(pfn_to_mfn_table); + if ( pfn_type != NULL ) + free(pfn_type); + + gzclose(gfd); + + return (rc == 0) ? dom : rc; +} diff --git a/tools/libxi/libxi_linux_save.c b/tools/libxi/libxi_linux_save.c new file mode 100644 index 0000000000..d651ba44b2 --- /dev/null +++ b/tools/libxi/libxi_linux_save.c @@ -0,0 +1,380 @@ +/****************************************************************************** + * libxi_linux_save.c + * + * Save the state of a running Xenolinux session. + * + * Copyright (c) 2003, K A Fraser. + */ + +#include "libxi_private.h" +#include +#include + +/* This may allow us to create a 'quiet' command-line option, if necessary. */ +#define verbose_printf(_f, _a...) \ + do { \ + if ( !verbose ) break; \ + printf( _f , ## _a ); \ + fflush(stdout); \ + } while ( 0 ) + +/* + * Returns TRUE if the given machine frame number has a unique mapping + * in the guest's pseudophysical map. + */ +#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \ + (((_mfn) < (1024*1024)) && \ + (pfn_to_mfn_table[mfn_to_pfn_table[_mfn]] == (_mfn))) + +/* Returns TRUE if MFN is successfully converted to a PFN. 
*/ +#define translate_mfn_to_pfn(_pmfn) \ +({ \ + unsigned long mfn = *(_pmfn); \ + int _res = 1; \ + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \ + _res = 0; \ + else \ + *(_pmfn) = mfn_to_pfn_table[mfn]; \ + _res; \ +}) + +static int check_pfn_ownership(unsigned long mfn, unsigned int dom) +{ + dom0_op_t op; + op.cmd = DOM0_GETPAGEFRAMEINFO; + op.u.getpageframeinfo.pfn = mfn; + if ( (do_dom0_op(&op) < 0) || (op.u.getpageframeinfo.domain != dom) ) + return 0; + return 1; +} + +#define GETPFN_ERR (~0U) +static unsigned int get_pfn_type(unsigned long mfn) +{ + dom0_op_t op; + op.cmd = DOM0_GETPAGEFRAMEINFO; + op.u.getpageframeinfo.pfn = mfn; + if ( do_dom0_op(&op) < 0 ) + { + PERROR("Unexpected failure when getting page frame info!"); + return GETPFN_ERR; + } + return op.u.getpageframeinfo.type; +} + +static int checked_write(gzFile fd, void *buf, size_t count) +{ + int rc; + while ( ((rc = gzwrite(fd, buf, count)) == -1) && (errno = EINTR) ) + continue; + return rc == count; +} + +int xi_linux_save(unsigned int domid, const char *state_file, int verbose) +{ + dom0_op_t op; + int rc = 1, i, j; + unsigned long mfn; + unsigned int prev_pc, this_pc; + + /* Remember if we stopped the guest, so we can restart it on exit. */ + int we_stopped_it = 0; + + /* The new domain's shared-info frame number. */ + unsigned long shared_info_frame; + + /* A copy of the CPU context of the guest. */ + full_execution_context_t ctxt; + + /* A copy of the domain's name. */ + char name[MAX_DOMAIN_NAME]; + + /* A table containg the type of each PFN (/not/ MFN!). */ + unsigned long *pfn_type = NULL; + + /* A temporary mapping, and a copy, of one frame of guest memory. */ + unsigned long *ppage, page[1024]; + + /* A temporary mapping, and a copy, of the pfn-to-mfn table frame list. */ + unsigned long *p_pfn_to_mfn_frame_list, pfn_to_mfn_frame_list[1024]; + /* A temporary mapping of one frame in the above list. */ + unsigned long *pfn_to_mfn_frame; + + /* A table mapping each PFN to its current MFN. 
*/ + unsigned long *pfn_to_mfn_table = NULL; + /* A table mapping each current MFN to its canonical PFN. */ + unsigned long *mfn_to_pfn_table = NULL; + + /* A temporary mapping, and a copy, of the guest's suspend record. */ + suspend_record_t *p_srec, srec; + + /* The name and descriptor of the file that we are writing to. */ + int fd; + gzFile gfd; + + if ( (fd = open(state_file, O_CREAT|O_EXCL|O_WRONLY, 0644)) == -1 ) + { + PERROR("Could not open file for writing"); + return 1; + } + + /* + * Compression rate 1: we want speed over compression. We're mainly going + * for those zero pages, after all. + */ + if ( (gfd = gzdopen(fd, "wb1")) == NULL ) + { + ERROR("Could not allocate compression state for state file"); + close(fd); + return 1; + } + + /* Ensure that the domain exists, and that it is stopped. */ + for ( ; ; ) + { + op.cmd = DOM0_GETDOMAININFO; + op.u.getdomaininfo.domain = domid; + if ( (do_dom0_op(&op) < 0) || (op.u.getdomaininfo.domain != domid) ) + { + PERROR("Could not get info on domain"); + goto out; + } + + memcpy(&ctxt, &op.u.getdomaininfo.ctxt, sizeof(ctxt)); + memcpy(name, op.u.getdomaininfo.name, sizeof(name)); + shared_info_frame = op.u.getdomaininfo.shared_info_frame; + + if ( op.u.getdomaininfo.state == DOMSTATE_STOPPED ) + break; + + we_stopped_it = 1; + + op.cmd = DOM0_STOPDOMAIN; + op.u.stopdomain.domain = domid; + (void)do_dom0_op(&op); + + sleep(1); + } + + /* A cheesy test to see whether the domain contains valid state. */ + if ( ctxt.pt_base == 0 ) + { + ERROR("Domain is not in a valid Xenolinux state"); + goto out; + } + + if ( init_pfn_mapper() < 0 ) + goto out; + + /* Is the suspend-record MFN actually valid for this domain? */ + if ( !check_pfn_ownership(ctxt.i386_ctxt.esi, domid) ) + { + ERROR("Invalid state record pointer"); + goto out; + } + + /* If the suspend-record MFN is okay then grab a copy of it to @srec. 
*/ + p_srec = map_pfn(ctxt.i386_ctxt.esi); + memcpy(&srec, p_srec, sizeof(srec)); + unmap_pfn(p_srec); + + if ( srec.nr_pfns > 1024*1024 ) + { + ERROR("Invalid state record -- pfn count out of range"); + goto out; + } + + if ( !check_pfn_ownership(srec.pfn_to_mfn_frame_list, domid) ) + { + ERROR("Invalid pfn-to-mfn frame list pointer"); + goto out; + } + + /* Grab a copy of the pfn-to-mfn table frame list. */ + p_pfn_to_mfn_frame_list = map_pfn(srec.pfn_to_mfn_frame_list); + memcpy(pfn_to_mfn_frame_list, p_pfn_to_mfn_frame_list, PAGE_SIZE); + unmap_pfn(p_pfn_to_mfn_frame_list); + + /* We want zeroed memory so use calloc rather than malloc. */ + mfn_to_pfn_table = calloc(1, 4 * 1024 * 1024); + pfn_to_mfn_table = calloc(1, 4 * srec.nr_pfns); + pfn_type = calloc(1, 4 * srec.nr_pfns); + + if ( (mfn_to_pfn_table == NULL) || + (pfn_to_mfn_table == NULL) || + (pfn_type == NULL) ) + { + errno = ENOMEM; + goto out; + } + + + /* + * Construct the local pfn-to-mfn and mfn-to-pfn tables. On exit from this + * loop we have each MFN mapped at most once. Note that there may be MFNs + * that aren't mapped at all: we detect these by MFN_IS_IN_PSEUDOPHYS_MAP. + */ + pfn_to_mfn_frame = NULL; + for ( i = 0; i < srec.nr_pfns; i++ ) + { + /* Each frameful of table frames must be checked & mapped on demand. */ + if ( (i & 1023) == 0 ) + { + mfn = pfn_to_mfn_frame_list[i/1024]; + if ( !check_pfn_ownership(mfn, domid) ) + { + ERROR("Invalid frame number if pfn-to-mfn frame list"); + goto out; + } + if ( pfn_to_mfn_frame != NULL ) + unmap_pfn(pfn_to_mfn_frame); + pfn_to_mfn_frame = map_pfn(mfn); + } + + mfn = pfn_to_mfn_frame[i & 1023]; + + if ( !check_pfn_ownership(mfn, domid) ) + { + ERROR("Invalid frame specified with pfn-to-mfn table"); + goto out; + } + + /* Did we map this MFN already? That would be invalid! 
*/ + if ( MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) + { + ERROR("A machine frame appears twice in pseudophys space"); + goto out; + } + + pfn_to_mfn_table[i] = mfn; + mfn_to_pfn_table[mfn] = i; + + /* Query page type by MFN, but store it by PFN. */ + if ( (pfn_type[i] = get_pfn_type(mfn)) == GETPFN_ERR ) + goto out; + } + + /* Canonicalise the suspend-record frame number. */ + if ( !translate_mfn_to_pfn(&ctxt.i386_ctxt.esi) ) + { + ERROR("State record is not in range of pseudophys map"); + goto out; + } + + /* Canonicalise each GDT frame number. */ + for ( i = 0; i < ctxt.gdt_ents; i += 512 ) + { + if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) + { + ERROR("GDT frame is not in range of pseudophys map"); + goto out; + } + } + + /* Canonicalise the page table base pointer. */ + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.pt_base >> PAGE_SHIFT) ) + { + ERROR("PT base is not in range of pseudophys map"); + goto out; + } + ctxt.pt_base = mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] << PAGE_SHIFT; + + /* Canonicalise the pfn-to-mfn table frame-number list. */ + for ( i = 0; i < srec.nr_pfns; i += 1024 ) + { + if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ) + { + ERROR("Frame # in pfn-to-mfn frame list is not in pseudophys"); + goto out; + } + } + + /* Start writing out the saved-domain record. */ + ppage = map_pfn(shared_info_frame); + if ( !checked_write(gfd, "XenoLinuxSuspend", 16) || + !checked_write(gfd, name, sizeof(name)) || + !checked_write(gfd, &srec.nr_pfns, sizeof(unsigned long)) || + !checked_write(gfd, &ctxt, sizeof(ctxt)) || + !checked_write(gfd, ppage, PAGE_SIZE) || + !checked_write(gfd, pfn_to_mfn_frame_list, PAGE_SIZE) || + !checked_write(gfd, pfn_type, 4 * srec.nr_pfns) ) + { + ERROR("Error when writing to state file"); + goto out; + } + unmap_pfn(ppage); + + verbose_printf("Saving memory pages: 0%%"); + + /* Now write out each data page, canonicalising page tables as we go... 
*/ + prev_pc = 0; + for ( i = 0; i < srec.nr_pfns; i++ ) + { + this_pc = (i * 100) / srec.nr_pfns; + if ( (this_pc - prev_pc) >= 5 ) + { + verbose_printf("\b\b\b\b%3d%%", this_pc); + prev_pc = this_pc; + } + + mfn = pfn_to_mfn_table[i]; + + ppage = map_pfn(mfn); + memcpy(page, ppage, PAGE_SIZE); + unmap_pfn(ppage); + + if ( (pfn_type[i] == L1TAB) || (pfn_type[i] == L2TAB) ) + { + for ( j = 0; + j < ((pfn_type[i] == L2TAB) ? + (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024); + j++ ) + { + if ( !(page[j] & _PAGE_PRESENT) ) continue; + mfn = page[j] >> PAGE_SHIFT; + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) + { + ERROR("Frame number in pagetable page is invalid"); + goto out; + } + page[j] &= PAGE_SIZE - 1; + page[j] |= mfn_to_pfn_table[mfn] << PAGE_SHIFT; + } + } + + if ( !checked_write(gfd, page, PAGE_SIZE) ) + { + ERROR("Error when writing to state file"); + goto out; + } + } + + verbose_printf("\b\b\b\b100%%\nMemory saved.\n"); + + /* Success! */ + rc = 0; + + out: + /* Restart the domain if we had to stop it to save its state. */ + if ( we_stopped_it ) + { + op.cmd = DOM0_STARTDOMAIN; + op.u.startdomain.domain = domid; + (void)do_dom0_op(&op); + } + + gzclose(gfd); + + if ( pfn_to_mfn_table != NULL ) + free(pfn_to_mfn_table); + if ( mfn_to_pfn_table != NULL ) + free(mfn_to_pfn_table); + if ( pfn_type != NULL ) + free(pfn_type); + + /* On error, make sure the file is deleted. */ + if ( rc != 0 ) + unlink(state_file); + + return !!rc; +} diff --git a/tools/libxi/libxi_misc.c b/tools/libxi/libxi_misc.c new file mode 100644 index 0000000000..c8046283c1 --- /dev/null +++ b/tools/libxi/libxi_misc.c @@ -0,0 +1,50 @@ +/****************************************************************************** + * libxi_misc.c + * + * Miscellaneous control interface functions. 
+ */ + +#include "libxi_private.h" + +int privcmd_fd = -1; + +int xi_interface_open(void) +{ + if ( (privcmd_fd == -1) && + ((privcmd_fd = open("/proc/xeno/privcmd", O_RDWR)) < 0) ) + { + privcmd_fd = -1; + return -1; + } + return 0; +} + +int xi_interface_close(void) +{ + if ( privcmd_fd != -1 ) + { + close(privcmd_fd); + privcmd_fd = -1; + } + return 0; +} + + +#define CONSOLE_RING_CLEAR 1 + +int xi_readconsolering(char *str, unsigned int max_chars, int clear) +{ + int ret; + dom0_op_t op; + + op.cmd = DOM0_READCONSOLE; + op.u.readconsole.str = (unsigned long)str; + op.u.readconsole.count = max_chars; + op.u.readconsole.cmd = clear ? CONSOLE_RING_CLEAR : 0; + + if ( (ret = do_dom0_op(&op)) > 0 ) + str[ret] = '\0'; + + return ret; +} + diff --git a/tools/libxi/libxi_private.c b/tools/libxi/libxi_private.c new file mode 100644 index 0000000000..9f9ace41c5 --- /dev/null +++ b/tools/libxi/libxi_private.c @@ -0,0 +1,34 @@ +/****************************************************************************** + * libxi_private.c + * + * Helper functions for the rest of the library. 
+ */
+
+#include "libxi_private.h"
+/* Descriptor for /dev/mem; stays -1 until init_pfn_mapper() opens it. */
+static int devmem_fd = -1;
+/*
+ * Open /dev/mem for read/write unless it is already open.
+ * Returns 0 when the descriptor is available, -1 when the open failed.
+ */
+int init_pfn_mapper(void)
+{
+    if ( (devmem_fd == -1) &&
+         ((devmem_fd = open("/dev/mem", O_RDWR)) < 0) )
+    {
+        devmem_fd = -1;
+        return -1;
+    }
+    return 0;
+}
+/*
+ * Map the single page at frame number @pfn from /dev/mem, shared and
+ * read/write. Returns the mapping, or NULL if mmap() failed.
+ */
+void *map_pfn(unsigned long pfn)
+{
+    void *vaddr = mmap(NULL, PAGE_SIZE, PROT_READ|PROT_WRITE,
+                       MAP_SHARED, devmem_fd, pfn << PAGE_SHIFT);
+    if ( vaddr == MAP_FAILED )
+        return NULL;
+    return vaddr;
+}
+/* Drop a one-page mapping created by map_pfn(); munmap() errors are ignored. */
+void unmap_pfn(void *vaddr)
+{
+    (void)munmap(vaddr, PAGE_SIZE);
+}
diff --git a/tools/libxi/libxi_private.h b/tools/libxi/libxi_private.h
new file mode 100644
index 0000000000..2e75a05535
--- /dev/null
+++ b/tools/libxi/libxi_private.h
@@ -0,0 +1,155 @@
+
+#ifndef __LIBXI_PRIVATE_H__
+#define __LIBXI_PRIVATE_H__
+/* Short fixed-size integer aliases. */
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned long u32;
+typedef unsigned long long u64;
+typedef signed char s8;
+typedef signed short s16;
+typedef signed long s32;
+typedef signed long long s64;
+/*
+ * NOTE(review): the operands of the bare #include directives below are
+ * missing from this copy of the patch.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "xi.h"
+
+#include
+#include
+#include
+#include
+/* Page-table entry flag bits. */
+#define _PAGE_PRESENT 0x001
+#define _PAGE_RW 0x002
+#define _PAGE_USER 0x004
+#define _PAGE_PWT 0x008
+#define _PAGE_PCD 0x010
+#define _PAGE_ACCESSED 0x020
+#define _PAGE_DIRTY 0x040
+#define _PAGE_PAT 0x080
+#define _PAGE_PSE 0x080
+#define _PAGE_GLOBAL 0x100
+
+/* Two-level page-table geometry: 1024 entries per table, 4kB pages. */
+#define L1_PAGETABLE_SHIFT 12
+#define L2_PAGETABLE_SHIFT 22
+
+#define ENTRIES_PER_L1_PAGETABLE 1024
+#define ENTRIES_PER_L2_PAGETABLE 1024
+
+#define PAGE_SHIFT L1_PAGETABLE_SHIFT
+#define PAGE_SIZE (1UL << PAGE_SHIFT)
+#define PAGE_MASK (~(PAGE_SIZE-1))
+
+typedef struct { unsigned long l1_lo; } l1_pgentry_t;
+typedef struct { unsigned long l2_lo; } l2_pgentry_t;
+/* Index of address @_a within its L1 / L2 table. */
+#define l1_table_offset(_a) \
+          (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1))
+#define l2_table_offset(_a) \
+          ((_a) >> L2_PAGETABLE_SHIFT)
+/* Print message @_m on stderr with an "ERROR: " prefix. */
+#define ERROR(_m) \
+    fprintf(stderr, "ERROR: %s\n", (_m))
+/* As ERROR(), but also appends the current errno value and its description. */
+#define PERROR(_m) \
+    fprintf(stderr, "ERROR: %s (%d = %s)\n", (_m), errno, strerror(errno))
+
+extern int privcmd_fd; /* defined elsewhere; the ioctl target used below */
+static inline int do_privcmd(unsigned int cmd, unsigned long data)
+{
+    return ioctl(privcmd_fd, cmd, data);
+}
+/* Pass @hypercall to the privcmd device via IOCTL_PRIVCMD_HYPERCALL. */
+static inline int do_xen_hypercall(privcmd_hypercall_t *hypercall)
+{
+    return do_privcmd(IOCTL_PRIVCMD_HYPERCALL, (unsigned long)hypercall);
+}
+/*
+ * Issue a dom0 operation. @op is stamped with DOM0_INTERFACE_VERSION and
+ * kept mlock()ed for the duration of the hypercall.
+ * Returns 0 on success, -1 if mlock() fails, or the negative hypercall
+ * result. An EACCES failure additionally prints a hint on stderr.
+ */
+static inline int do_dom0_op(dom0_op_t *op)
+{
+    int ret = -1;
+    privcmd_hypercall_t hypercall;
+
+    op->interface_version = DOM0_INTERFACE_VERSION;
+
+    hypercall.op = __HYPERVISOR_dom0_op;
+    hypercall.arg[0] = (unsigned long)op;
+
+    if ( mlock(op, sizeof(*op)) != 0 )
+        goto out1;
+
+    if ( (ret = do_xen_hypercall(&hypercall)) < 0 )
+    {
+        if ( errno == EACCES )
+            fprintf(stderr, "Dom0 operation failed -- need to"
+                    " rebuild the user-space tool set?\n");
+        goto out2;
+    }
+
+    ret = 0;
+
+ out2: (void)munlock(op, sizeof(*op));
+ out1: return ret;
+}
+/*
+ * Issue a network operation with @op mlock()ed for the hypercall.
+ * Returns 0 on success, -1 if mlock() fails, or the negative hypercall
+ * result.
+ */
+static inline int do_network_op(network_op_t *op)
+{
+    int ret = -1;
+    privcmd_hypercall_t hypercall;
+
+    hypercall.op = __HYPERVISOR_network_op;
+    hypercall.arg[0] = (unsigned long)op;
+
+    if ( mlock(op, sizeof(*op)) != 0 )
+        goto out1;
+
+    if ( (ret = do_xen_hypercall(&hypercall)) < 0 )
+        goto out2;
+
+    ret = 0;
+
+ out2: (void)munlock(op, sizeof(*op));
+ out1: return ret;
+}
+
+/*
+ * Issue a block-I/O operation with @op mlock()ed for the hypercall.
+ * Returns 0 on success and -1 on any failure: unlike the two helpers
+ * above, the hypercall's own result is not passed back.
+ */
+static inline int do_block_io_op(block_io_op_t *op)
+{
+    int ret = -1;
+    privcmd_hypercall_t hypercall;
+
+    hypercall.op = __HYPERVISOR_block_io_op;
+    hypercall.arg[0] = (unsigned long)op;
+
+    if ( mlock(op, sizeof(*op)) != 0 )
+        goto out1;
+
+    if ( do_xen_hypercall(&hypercall) < 0 )
+        goto out2;
+
+    ret = 0;
+
+ out2: (void)munlock(op, sizeof(*op));
+ out1: return ret;
+}
+
+/*
+ * PFN mapping.
+ */ +int init_pfn_mapper(void); +void *map_pfn(unsigned long pfn); +void unmap_pfn(void *vaddr); + +#endif /* __LIBXI_PRIVATE_H__ */ diff --git a/tools/libxi/libxi_vbd.c b/tools/libxi/libxi_vbd.c new file mode 100644 index 0000000000..863f96ca13 --- /dev/null +++ b/tools/libxi/libxi_vbd.c @@ -0,0 +1,116 @@ +/****************************************************************************** + * libxi_vbd.c + * + * API for manipulating and accessing per-domain virtual block devices. + * + * Copyright (c) 2003, K A Fraser. + */ + +#define _GNU_SOURCE +#include "libxi_private.h" + +int xi_vbd_create(unsigned int domid, unsigned short vbdid, int writeable) +{ + block_io_op_t op; + op.cmd = BLOCK_IO_OP_VBD_CREATE; + op.u.create_params.domain = domid; + op.u.create_params.vdevice = vbdid; + op.u.create_params.mode = VBD_MODE_R | (writeable ? VBD_MODE_W : 0); + return do_block_io_op(&op); +} + + +int xi_vbd_destroy(unsigned int domid, unsigned short vbdid) +{ + block_io_op_t op; + op.cmd = BLOCK_IO_OP_VBD_DELETE; + op.u.delete_params.domain = domid; + op.u.delete_params.vdevice = vbdid; + return do_block_io_op(&op); +} + + +int xi_vbd_add_extent(unsigned int domid, + unsigned short vbdid, + unsigned short real_device, + unsigned long start_sector, + unsigned long nr_sectors) +{ + block_io_op_t op; + op.cmd = BLOCK_IO_OP_VBD_ADD; + op.u.add_params.domain = domid; + op.u.add_params.vdevice = vbdid; + op.u.add_params.extent.device = real_device; + op.u.add_params.extent.start_sector = start_sector; + op.u.add_params.extent.nr_sectors = nr_sectors; + return do_block_io_op(&op); +} + + +int xi_vbd_delete_extent(unsigned int domid, + unsigned short vbdid, + unsigned short real_device, + unsigned long start_sector, + unsigned long nr_sectors) +{ + block_io_op_t op; + op.cmd = BLOCK_IO_OP_VBD_REMOVE; + op.u.add_params.domain = domid; + op.u.add_params.vdevice = vbdid; + op.u.add_params.extent.device = real_device; + op.u.add_params.extent.start_sector = start_sector; + 
op.u.add_params.extent.nr_sectors = nr_sectors; + return do_block_io_op(&op); +} + + +int xi_vbd_probe(unsigned int domid, + unsigned short vbdid, + unsigned int max_vbds, + xi_vbd_t *vbds) +{ + block_io_op_t op; + xen_disk_info_t *xdi = &op.u.probe_params.xdi; + int i, j, ret, allocsz = max_vbds * sizeof(xen_disk_t); + + op.cmd = BLOCK_IO_OP_VBD_PROBE; + op.u.probe_params.domain = domid; + + xdi->max = max_vbds; + xdi->disks = malloc(allocsz); + xdi->count = 0; + + if ( (xdi->disks == NULL) || (mlock(xdi->disks, allocsz) != 0) ) + { + if ( xdi->disks != NULL ) + free(xdi->disks); + return -ENOMEM; + } + + ret = do_block_io_op(&op); + + (void)munlock(xdi->disks, allocsz); + + if ( ret >= 0 ) + { + for ( i = 0, j = 0; i < xdi->count; i++ ) + { + if ( !(xdi->disks[i].info & XD_FLAG_VIRT) ) + continue; + + vbds[j].domid = xdi->disks[i].domain; + vbds[j].vbdid = xdi->disks[i].device; + vbds[j].flags = (xdi->disks[i].info & XD_FLAG_RO) ? + 0 : XI_VBDF_WRITEABLE; + vbds[j].nr_sectors = xdi->disks[i].capacity; + + j++; + } + + ret = j; + } + + free(xdi->disks); + + return ret; +} diff --git a/tools/libxi/libxi_vif.c b/tools/libxi/libxi_vif.c new file mode 100644 index 0000000000..c95b45c030 --- /dev/null +++ b/tools/libxi/libxi_vif.c @@ -0,0 +1,66 @@ +/****************************************************************************** + * libxi_vif.c + * + * API for manipulating and accessing per-network-interface parameters. + * + * Copyright (c) 2003, K A Fraser. 
+ */ + +#include "libxi_private.h" + +int xi_vif_scheduler_set(unsigned int domid, + unsigned int vifid, + xi_vif_sched_params_t *params) +{ + network_op_t netop; + netop.cmd = NETWORK_OP_VIFSETPARAMS; + netop.u.vif_setparams.domain = domid; + netop.u.vif_setparams.vif = vifid; + netop.u.vif_setparams.credit_bytes = params->credit_bytes; + netop.u.vif_setparams.credit_usec = params->credit_usec; + return do_network_op(&netop); +} + + +int xi_vif_scheduler_get(unsigned int domid, + unsigned int vifid, + xi_vif_sched_params_t *params) +{ + network_op_t netop; + int rc; + + netop.cmd = NETWORK_OP_VIFGETINFO; + netop.u.vif_getinfo.domain = domid; + netop.u.vif_getinfo.vif = vifid; + + if ( (rc = do_network_op(&netop)) >= 0 ) + { + params->credit_bytes = netop.u.vif_getinfo.credit_bytes; + params->credit_usec = netop.u.vif_getinfo.credit_usec; + } + + return rc; +} + + +int xi_vif_stats_get(unsigned int domid, + unsigned int vifid, + xi_vif_stats_t *stats) +{ + network_op_t netop; + int rc; + + netop.cmd = NETWORK_OP_VIFGETINFO; + netop.u.vif_getinfo.domain = domid; + netop.u.vif_getinfo.vif = vifid; + + if ( (rc = do_network_op(&netop)) >= 0 ) + { + stats->tx_bytes = netop.u.vif_getinfo.total_bytes_sent; + stats->tx_pkts = netop.u.vif_getinfo.total_packets_sent; + stats->rx_bytes = netop.u.vif_getinfo.total_bytes_received; + stats->rx_pkts = netop.u.vif_getinfo.total_packets_received; + } + + return rc; +} diff --git a/tools/libxi/rpm.spec b/tools/libxi/rpm.spec new file mode 100644 index 0000000000..1a8f42a936 --- /dev/null +++ b/tools/libxi/rpm.spec @@ -0,0 +1,28 @@ +Summary: Xen control interface library +Name: xen-internal-library +Version: 1.2 +Release: 1 +License: Xen +Group: Xen +BuildRoot: %{staging} +%description +Library to make it easier to access the Xen control interfaces. 
+
+%pre
+%preun
+%install
+install -m 0755 -d $RPM_BUILD_ROOT/lib
+install -m 0755 libxi.a $RPM_BUILD_ROOT/lib/libxi.a
+install -m 0755 libxi.so $RPM_BUILD_ROOT/lib/libxi.so
+install -m 0755 -d $RPM_BUILD_ROOT/include
+install -m 0644 xi.h $RPM_BUILD_ROOT/include/xi.h
+%clean
+%post
+%postun
+%files
+%defattr(-,root,root)
+%dir /lib
+/lib/libxi.a
+/lib/libxi.so
+%dir /include
+/include/xi.h
diff --git a/tools/libxi/xi.h b/tools/libxi/xi.h
new file mode 100644
index 0000000000..6b167bb212
--- /dev/null
+++ b/tools/libxi/xi.h
@@ -0,0 +1,99 @@
+/******************************************************************************
+ * xi.h
+ * 
+ * A library for low-level access to the Xen control interfaces.
+ * 
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#ifndef __XI_H__
+#define __XI_H__
+/*
+ * Open / close the privileged-command device used by the rest of the API.
+ * xi_interface_open() returns 0 on success and -1 on failure;
+ * xi_interface_close() always returns 0.
+ */
+int xi_interface_open(void);
+int xi_interface_close(void);
+/* Per-domain record filled in by xi_domain_getinfo(). */
+typedef struct {
+    unsigned int domid;
+    unsigned int cpu;
+    int has_cpu;
+    int stopped;
+    unsigned long nr_pages;
+    unsigned long long cpu_time;
+#define XI_DOMINFO_MAXNAME 16
+    char name[XI_DOMINFO_MAXNAME];
+} xi_dominfo_t;
+/* Domain life-cycle control and enumeration. */
+int xi_domain_create(unsigned int mem_kb, const char *name);
+int xi_domain_start(unsigned int domid);
+int xi_domain_stop(unsigned int domid);
+int xi_domain_destroy(unsigned int domid, int force);
+int xi_domain_getinfo(unsigned int first_domid,
+                      unsigned int max_doms,
+                      xi_dominfo_t *info);
+/*
+ * Xenolinux save / restore / build.
+ * xi_linux_save() writes a new compressed state file (which must not yet
+ * exist) and returns 0 on success, 1 on failure.
+ * xi_linux_restore() returns the new domain's id on success.
+ */
+int xi_linux_save(unsigned int domid, const char *state_file, int verbose);
+int xi_linux_restore(const char *state_file, int verbose);
+int xi_linux_build(unsigned int domid,
+                   const char *image_name,
+                   const char *ramdisk_name,
+                   const char *cmdline,
+                   int verbose);
+/* BVT scheduler parameters: global and per-domain. */
+int xi_bvtsched_global_set(unsigned long ctx_allow);
+int xi_bvtsched_domain_set(unsigned int domid,
+                           unsigned long mcuadv,
+                           unsigned long warp,
+                           unsigned long warpl,
+                           unsigned long warpu);
+/* Credit parameters exchanged by xi_vif_scheduler_set() / _get(). */
+typedef struct {
+    unsigned long credit_bytes;
+    unsigned long credit_usec;
+} xi_vif_sched_params_t;
+/* Transmit / receive byte and packet totals from xi_vif_stats_get(). */
+typedef struct {
+    unsigned long long tx_bytes, tx_pkts;
+    unsigned long long rx_bytes, rx_pkts;
+} xi_vif_stats_t;
+/*
+ * Per-interface operations. Each returns the result of the underlying
+ * network operation; the _get variants fill in their output structure
+ * only when that result is not negative.
+ */
+int xi_vif_scheduler_set(unsigned int domid,
+                         unsigned int vifid,
+                         xi_vif_sched_params_t *params);
+int xi_vif_scheduler_get(unsigned int domid,
+                         unsigned int vifid,
+                         xi_vif_sched_params_t *params);
+int xi_vif_stats_get(unsigned int domid,
+                     unsigned int vifid,
+                     xi_vif_stats_t *stats);
+/*
+ * One virtual block device as reported by xi_vbd_probe(). @flags has
+ * XI_VBDF_WRITEABLE set unless the device was reported read-only.
+ */
+typedef struct {
+#define XI_VBDDOM_PROBE_ALL (~0U)
+    unsigned int domid;
+    unsigned short vbdid;
+#define XI_VBDF_WRITEABLE (1<<0)
+    unsigned long flags;
+    unsigned long nr_sectors;
+} xi_vbd_t;
+
+/*
+ * Virtual block device management. xi_vbd_probe() stores at most @max_vbds
+ * entries in @vbds and returns how many it wrote, or a negative value on
+ * failure.
+ */
+int xi_vbd_create(unsigned int domid, unsigned short vbdid, int writeable);
+int xi_vbd_destroy(unsigned int domid, unsigned short vbdid);
+int xi_vbd_add_extent(unsigned int domid,
+                      unsigned short vbdid,
+                      unsigned short real_device,
+                      unsigned long start_sector,
+                      unsigned long nr_sectors);
+int xi_vbd_delete_extent(unsigned int domid,
+                         unsigned short vbdid,
+                         unsigned short real_device,
+                         unsigned long start_sector,
+                         unsigned long nr_sectors);
+int xi_vbd_probe(unsigned int domid,
+                 unsigned short vbdid,
+                 unsigned int max_vbds,
+                 xi_vbd_t *vbds);
+/* Issue a console-ring read into @str; non-zero @clear requests a clear. */
+int xi_readconsolering(char *str, unsigned int max_chars, int clear);
+
+
+#endif /* __XI_H__ */
-- 
2.30.2